## ------------------------------------------------------------------------- ##
## Aggregates the household level DHS data for each survey cluster.
## Philippines 2017 DHS.
##
## Additionally, for each of the 2017 DHS clusters, it computes an average
## Wealth Index of its neighbouring locations from the previous 2008 DHS.
## ------------------------------------------------------------------------- ##

# set working directory as appropriate
wd <- "."
setwd(wd)

library(foreign)
library(rgdal)
library(geosphere)


# DHS survey dataset files
# These files can be downloaded from DHS website after registering
# and requesting access:
# https://dhsprogram.com/data/dataset/Philippines_Standard-DHS_2017.cfm?flag=0

dhs_household_survey_file <- "dhs_files/PHHR70DT/PHHR70FL.dta"
geo_folder <- "dhs_files/PHGE71FL"
geo_file <- "PHGE71FL"

# data file
datas <- read.dta(dhs_household_survey_file)

# geo shape files for DHS clusters
geo_datas <- readOGR(dsn = geo_folder, layer = geo_file)

# summary statistics
summary(datas$hv015) # interview status
summary(datas$hv021) # PSU
summary(datas$hv206) # electricity
summary(datas$hv207) # radio
summary(datas$hv208) # TV
summary(datas$hv209) # refrigerator
summary(datas$hv243a) # mobile phone
summary(datas$hv243e) # computer
summary(datas$hv270) # wealth index
summary(datas$hv271) # wealth index factor score
summary(datas$sh121g) # washing machine
summary(datas$sh121h) # air conditioner
summary(datas$sh121k) # cable services

## aggregate the data we need per cluster level
cluster_data <- data.frame(cluster = as.character(geo_datas@data$DHSCLUST))
cluster_data$num_households <- NA
cluster_data$has_electricity <- NA
cluster_data$has_radio <- NA
cluster_data$has_TV <- NA
cluster_data$has_refrigerator <- NA
cluster_data$has_mobile <- NA
cluster_data$has_computer <- NA
cluster_data$cluster_mean_wealth_index <- NA
cluster_data$has_washing_machine <- NA
cluster_data$has_air_conditioner <- NA
cluster_data$has_cable_service <- NA

for (i in 1:nrow(cluster_data)) {
  # the cluster
  clust <- as.character(cluster_data$cluster[i])
  I <- as.character(datas$hv001) == clust
  
  # number households
  num_households <- sum(I)
  cat("Cluster:",clust,"total households:", num_households,"               \r")
  cluster_data$num_households[i] <- num_households
  
  # skip clusters with no households
  if (cluster_data$num_households[i] == 0) { 
    print(paste("Cluster:",clust,"total households:", num_households,"; Skipping cluster."))
    next 
  }
  
  ## asset ownership
  cluster_data$has_electricity[i] <- sum(datas$hv206[I] == "yes")
  cluster_data$has_radio[i] <- sum(datas$hv207[I] == "yes")
  cluster_data$has_TV[i] <- sum(datas$hv208[I] == "yes")
  cluster_data$has_refrigerator[i] <- sum(datas$hv209[I] == "yes")
  cluster_data$has_mobile[i] <- sum(datas$hv243a[I] == "yes")
  cluster_data$has_computer[i] <- sum(datas$hv243e[I] == "yes")
  
  cluster_data$cluster_mean_wealth_index[i] <- mean(datas$hv271[I])
  
  cluster_data$has_washing_machine[i] <- sum(datas$sh121g[I] == "yes")
  cluster_data$has_air_conditioner[i] <- sum(datas$sh121h[I] == "yes")
  cluster_data$has_cable_service[i] <- sum(datas$sh121k[I] == "yes")
}

# compute fraction of households with different assets
assets <- c("has_electricity","has_radio","has_TV","has_refrigerator",
            "has_mobile","has_computer","has_washing_machine",
            "has_air_conditioner","has_cable_service")

for (item in assets) {
  item_frac <- paste(item,"frac",sep="_")
  cluster_data[item_frac] <- cluster_data[,item]/cluster_data$num_households
}

# merge with cluster geographic information
geo_sub <- geo_datas@data[,c("DHSID","DHSCC","DHSYEAR","DHSCLUST",
                             "ADM1DHS","ADM1NAME","DHSREGCO","DHSREGNA","SOURCE","URBAN_RURA","LATNUM","LONGNUM")]
geo_sub$DHSCLUST <- as.character(geo_sub$DHSCLUST)
geo_sub <- merge(x = geo_sub, by.x = "DHSCLUST", 
                 y = cluster_data, by.y = "cluster", all.x = TRUE)

# create the regional indicator variables
dhs_regions <- levels(as.factor(geo_sub$DHSREGNA))
for (reg in dhs_regions) {
  reg_dummy_var <- paste("region_is",reg,sep="_")
  geo_sub[reg_dummy_var] <- ifelse(as.character(geo_sub$DHSREGNA) == reg, 1, 0)
}

# save
write.csv(geo_sub,"Philippines_dhs_dataset.csv", row.names = FALSE)

## ----
## Compute estimates of Wealth Index for the recent DHS
## locations from the interpolated past DHS data.
## ----

geo_datas_dhs17 <- geo_datas[geo_datas$SOURCE != "MIS",]

# DHS 2008 survey dataset files
# Files can be downloaded from here:
# https://dhsprogram.com/data/dataset/Philippines_Standard-DHS_2008.cfm?flag=0
dhs_household_survey_file <- "dhs_files/PHHR52DT/PHHR52FL.dta"
geo_folder <- "dhs_files/PHGE52FL"
geo_file <- "PHGE52FL"

# data file
datas_dhs08 <- read.dta(dhs_household_survey_file)

# geo shape files for DHS clusters
geo_datas_dhs08 <- readOGR(dsn = geo_folder, layer = geo_file)
geo_datas_dhs08 <- geo_datas_dhs08[geo_datas_dhs08$SOURCE != "MIS",]

## aggregate the data we need per cluster level
cluster_data_dhs08 <- data.frame(cluster = as.character(geo_datas_dhs08@data$DHSCLUST))
cluster_data_dhs08$num_households <- NA
cluster_data_dhs08$cluster_mean_wealth_index <- NA

for (i in 1:nrow(cluster_data_dhs08)) {
  # the cluster
  clust <- as.character(cluster_data_dhs08$cluster[i])
  I <- as.character(datas_dhs08$hv001) == clust
  
  # number households
  num_households <- sum(I)
  cat("Cluster:",clust,"total households:", num_households,"               \r")
  cluster_data_dhs08$num_households[i] <- num_households
  
  # skip clusters with no households
  if (cluster_data_dhs08$num_households[i] == 0) { 
    print(paste("Cluster:",clust,"total households:", num_households,"; Skipping cluster."))
    next 
  }
  
  cluster_data_dhs08$cluster_mean_wealth_index[i] <- mean(datas_dhs08$hv271[I])
}

geo_datas_dhs08@data$cluster_mean_wealth_index <- cluster_data_dhs08$cluster_mean_wealth_index


## Interpolate the 2008 DHS Wealth Index values to the 2017 DHS
dist_mat <- matrix(0,nrow=nrow(geo_datas_dhs17), ncol = nrow(geo_datas_dhs08))
for (i in 1:nrow(geo_datas_dhs17)) {
  d <- distGeo(geo_datas_dhs17[i,],geo_datas_dhs08)
  dist_mat[i,] <- d
}

k <- 5 # interpolation from k-nearest neighbours

dhs_interpolated <- data.frame(clusterid = geo_datas_dhs17$DHSID,
                               Interpolated_Wealth_Index_past_dhs = NA,
                               stringsAsFactors = FALSE)
for (i in 1:nrow(geo_datas_dhs17)) {
  knn <- head(order(dist_mat[i,]),k) # we want the indices of nearest k points
  dhs_interpolated$Interpolated_Wealth_Index_past_dhs[i] <- 
    mean(geo_datas_dhs08$cluster_mean_wealth_index[knn])
}

## add this to the DHS dataset and save
geo_sub$Interpolated_Wealth_Index_past_dhs <- NA
for (i in 1:nrow(geo_sub)) {
  I <- dhs_interpolated$clusterid == geo_sub$DHSID[i]
  if (sum(I) == 1) {
    geo_sub$Interpolated_Wealth_Index_past_dhs[i] <- dhs_interpolated$Interpolated_Wealth_Index_past_dhs[I]
  }
}

write.csv(geo_sub,"Philippines_dhs_dataset.csv", row.names = FALSE)

